User Inputs

# User-supplied knit parameters (taken from the Rmd `params` list).
output.var = params$output.var 

# Hard-coded switches for this run (NOT taken from `params`).
transform.abs = FALSE   # no absolute-value transform of the output
log.pred = params$log.pred   # TRUE -> transform the output variable (see label setup below)
norm.pred = FALSE       # no bestNormalize-based normalization of the output
eda = params$eda        # TRUE -> run the (slow) exploratory-data-analysis chunks
# Flags selecting which caret-based model-selection algorithms run later on.
algo.forward.caret = params$algo.forward.caret
algo.backward.caret = params$algo.backward.caret
algo.stepwise.caret = params$algo.stepwise.caret
algo.LASSO.caret = params$algo.LASSO.caret
algo.LARS.caret = params$algo.LARS.caret
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
##  $ output.var         : chr "y3"
##  $ log.pred           : logi TRUE
##  $ eda                : logi TRUE
##  $ algo.forward.caret : logi FALSE
##  $ algo.backward.caret: logi FALSE
##  $ algo.stepwise.caret: logi FALSE
##  $ algo.LASSO.caret   : logi FALSE
##  $ algo.LARS.caret    : logi FALSE
# Setup Labels
# Name of the output column actually used for modeling: the cube-root-transformed
# output when log.pred is TRUE (historically a log10 transform, see the commented
# line), otherwise the raw output variable.
#output.var.tr = if (log.pred == TRUE)  paste0(output.var,'.log') else  output.var
# FIX: the original embedded a second assignment inside the else branch
# (`... else output.var.tr = output.var`), assigning the same variable twice;
# a single assignment of the if-expression's value is sufficient and clearer.
output.var.tr = if (log.pred == TRUE) paste0(output.var, '.cuberoot') else output.var

Loading Data

feat  = read.csv('../../Data/features_highprec.csv')   # predictor features, one row per job
labels = read.csv('../../Data/labels.csv')             # output variables, one row per job
# Every feature column except the join key is a candidate predictor.
predictors = names(dplyr::select(feat,-JobName))
# Keep only jobs present in both files (inner join on the job identifier).
data.ori = inner_join(feat,labels,by='JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')

Data validation

# Split complete vs. incomplete rows; modeling uses complete cases only.
cc  = complete.cases(data.ori)
data.notComplete = data.ori[! cc,]
# Keep only the predictors, the chosen output variable, and the join key.
data = data.ori[cc,] %>% select_at(c(predictors,output.var,'JobName'))
message('Original cases: ',nrow(data.ori))
## Original cases: 10000
message('Non-Complete cases: ',nrow(data.notComplete))
## Non-Complete cases: 3020
message('Complete cases: ',nrow(data))
## Complete cases: 6980
summary(dplyr::select_at(data,c('JobName',output.var)))
##       JobName           y3        
##  Job_00001:   1   Min.   : 95.91  
##  Job_00002:   1   1st Qu.:118.29  
##  Job_00003:   1   Median :124.03  
##  Job_00004:   1   Mean   :125.40  
##  Job_00007:   1   3rd Qu.:131.06  
##  Job_00008:   1   Max.   :193.73  
##  (Other)  :6974

Output Variable

The Output Variable y3 shows right skewness, so we will proceed with a cube-root transformation (see the transformation section below)

Histogram

# Density histogram of the (untransformed) output variable.
df=gather(select_at(data,output.var))
ggplot(df, aes(x=value)) + 
  geom_histogram(aes(y=..density..),bins = 50,fill='light blue') + 
  geom_density() 

  #stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))  

QQPlot

# Normal QQ plot of the output variable (departure from a straight line
# indicates non-normality).
ggplot(gather(select_at(data,output.var)), aes(sample=value)) + 
  stat_qq() + 
  facet_wrap(~key, scales = 'free',ncol=4)

Transformation of Output Variable from y3 to y3.cuberoot

#if(log.pred==TRUE) data[[output.var.tr]] = log(data[[output.var]],10) else
# Cube-root transform of the output to reduce right skewness. NOTE(review):
# despite its name, `log.pred` now selects the cube root (see the commented
# log10 variant above).
if(log.pred==TRUE) data[[output.var.tr]] = (data[[output.var]])^(1/3) else
  data[[output.var.tr]] = data[[output.var]]
# Compare densities of the original and transformed output side by side.
df=gather(select_at(data,c(output.var,output.var.tr)))
ggplot(df, aes(value)) + 
  geom_histogram(aes(y=..density..),bins = 50,fill='light blue') + 
  geom_density() + 
  # stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))  
  facet_wrap(~key, scales = 'free',ncol=2)

# QQ plots of the original vs. transformed output.
ggplot(gather(select_at(data,c(output.var,output.var.tr))), aes(sample=value)) + 
  stat_qq() + 
  facet_wrap(~key, scales = 'free',ncol=4)

Best Normalizator y3

Normalization of y3 using the bestNormalize package (which suggests orderNorm). This is interesting, but it goes beyond the objective of this project

# Let bestNormalize search for the best normalizing transformation of the
# output variable (CV-estimated Pearson P / df statistic; lower = more normal).
t=bestNormalize::bestNormalize(data[[output.var]])
t
## Best Normalizing transformation with 6980 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - No transform: 2.9364 
##  - Box-Cox: 1.45 
##  - Log_b(x+a): 2.0049 
##  - sqrt(x+a): 2.4487 
##  - exp(x): 749.1924 
##  - arcsinh(x): 2.0049 
##  - Yeo-Johnson: 1.1675 
##  - orderNorm: 1.1705 
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized Yeo-Johnson Transformation with 6980 nonmissing obs.:
##  Estimated statistics:
##  - lambda = -1.998639 
##  - mean (before standardization) = 0.5003083 
##  - sd (before standardization) = 5.108542e-06
# QQ plot of the raw output variable...
qqnorm(data[[output.var]])

# ...and of the values after the transformation chosen by bestNormalize.
qqnorm(predict(t))

orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a normal distribution

Predictors

Feature Engineering

# Derived features on the controlled variables:
#  - pairwise ratios, log transforms, and inverse-square transforms.
# Columns are appended in exactly the same order as before so that any
# downstream logic depending on column order is unaffected.
ratio.pairs = list(c('x2','x1'), c('x6','x5'), c('x9','x7'), c('x10','x8'),
                   c('x14','x12'), c('x15','x13'), c('x17','x16'),
                   c('x19','x18'), c('x21','x20'), c('x23','x22'))
for (pair in ratio.pairs) {
  data[[paste0(pair[1], 'by', pair[2])]] = data[[pair[1]]] / data[[pair[2]]]
}
log.vars = c('x1','x2','x5','x6','x7','x9','x8','x10','x12','x14','x13',
             'x15','x16','x17','x18','x19','x20','x21','x22','x23','x11')
for (v in log.vars) {
  data[[paste0(v, 'log')]] = log(data[[v]])
}
sqinv.vars = c('x1','x5','x7','x8','x12','x13','x16','x18','x20','x22')
for (v in sqinv.vars) {
  data[[paste0(v, 'sqinv')]] = 1 / (data[[v]])^2
}
predictors
##   [1] "x1"      "x2"      "x3"      "x4"      "x5"      "x6"      "x7"      "x8"      "x9"      "x10"     "x11"    
##  [12] "x12"     "x13"     "x14"     "x15"     "x16"     "x17"     "x18"     "x19"     "x20"     "x21"     "x22"    
##  [23] "x23"     "stat1"   "stat2"   "stat3"   "stat4"   "stat5"   "stat6"   "stat7"   "stat8"   "stat9"   "stat10" 
##  [34] "stat11"  "stat12"  "stat13"  "stat14"  "stat15"  "stat16"  "stat17"  "stat18"  "stat19"  "stat20"  "stat21" 
##  [45] "stat22"  "stat23"  "stat24"  "stat25"  "stat26"  "stat27"  "stat28"  "stat29"  "stat30"  "stat31"  "stat32" 
##  [56] "stat33"  "stat34"  "stat35"  "stat36"  "stat37"  "stat38"  "stat39"  "stat40"  "stat41"  "stat42"  "stat43" 
##  [67] "stat44"  "stat45"  "stat46"  "stat47"  "stat48"  "stat49"  "stat50"  "stat51"  "stat52"  "stat53"  "stat54" 
##  [78] "stat55"  "stat56"  "stat57"  "stat58"  "stat59"  "stat60"  "stat61"  "stat62"  "stat63"  "stat64"  "stat65" 
##  [89] "stat66"  "stat67"  "stat68"  "stat69"  "stat70"  "stat71"  "stat72"  "stat73"  "stat74"  "stat75"  "stat76" 
## [100] "stat77"  "stat78"  "stat79"  "stat80"  "stat81"  "stat82"  "stat83"  "stat84"  "stat85"  "stat86"  "stat87" 
## [111] "stat88"  "stat89"  "stat90"  "stat91"  "stat92"  "stat93"  "stat94"  "stat95"  "stat96"  "stat97"  "stat98" 
## [122] "stat99"  "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106" "stat107" "stat108" "stat109"
## [133] "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116" "stat117" "stat118" "stat119" "stat120"
## [144] "stat121" "stat122" "stat123" "stat124" "stat125" "stat126" "stat127" "stat128" "stat129" "stat130" "stat131"
## [155] "stat132" "stat133" "stat134" "stat135" "stat136" "stat137" "stat138" "stat139" "stat140" "stat141" "stat142"
## [166] "stat143" "stat144" "stat145" "stat146" "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153"
## [177] "stat154" "stat155" "stat156" "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164"
## [188] "stat165" "stat166" "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175"
## [199] "stat176" "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [210] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196" "stat197"
## [221] "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206" "stat207" "stat208"
## [232] "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216" "stat217"
# Rebuild the predictor list from the current columns of `data`:
# controlled variables (and their engineered variants) start with "x",
# summary-statistic variables start with "stat".
col.names = colnames(data)
controlled.vars = col.names[startsWith(col.names, "x")]
stat.vars = col.names[startsWith(col.names, "stat")]

predictors = c(controlled.vars, stat.vars)
predictors
##   [1] "x1"       "x2"       "x3"       "x4"       "x5"       "x6"       "x7"       "x8"       "x9"       "x10"     
##  [11] "x11"      "x12"      "x13"      "x14"      "x15"      "x16"      "x17"      "x18"      "x19"      "x20"     
##  [21] "x21"      "x22"      "x23"      "x2byx1"   "x6byx5"   "x9byx7"   "x10byx8"  "x14byx12" "x15byx13" "x17byx16"
##  [31] "x19byx18" "x21byx20" "x23byx22" "x1log"    "x2log"    "x5log"    "x6log"    "x7log"    "x9log"    "x8log"   
##  [41] "x10log"   "x12log"   "x14log"   "x13log"   "x15log"   "x16log"   "x17log"   "x18log"   "x19log"   "x20log"  
##  [51] "x21log"   "x22log"   "x23log"   "x11log"   "x1sqinv"  "x5sqinv"  "x7sqinv"  "x8sqinv"  "x12sqinv" "x13sqinv"
##  [61] "x16sqinv" "x18sqinv" "x20sqinv" "x22sqinv" "stat1"    "stat2"    "stat3"    "stat4"    "stat5"    "stat6"   
##  [71] "stat7"    "stat8"    "stat9"    "stat10"   "stat11"   "stat12"   "stat13"   "stat14"   "stat15"   "stat16"  
##  [81] "stat17"   "stat18"   "stat19"   "stat20"   "stat21"   "stat22"   "stat23"   "stat24"   "stat25"   "stat26"  
##  [91] "stat27"   "stat28"   "stat29"   "stat30"   "stat31"   "stat32"   "stat33"   "stat34"   "stat35"   "stat36"  
## [101] "stat37"   "stat38"   "stat39"   "stat40"   "stat41"   "stat42"   "stat43"   "stat44"   "stat45"   "stat46"  
## [111] "stat47"   "stat48"   "stat49"   "stat50"   "stat51"   "stat52"   "stat53"   "stat54"   "stat55"   "stat56"  
## [121] "stat57"   "stat58"   "stat59"   "stat60"   "stat61"   "stat62"   "stat63"   "stat64"   "stat65"   "stat66"  
## [131] "stat67"   "stat68"   "stat69"   "stat70"   "stat71"   "stat72"   "stat73"   "stat74"   "stat75"   "stat76"  
## [141] "stat77"   "stat78"   "stat79"   "stat80"   "stat81"   "stat82"   "stat83"   "stat84"   "stat85"   "stat86"  
## [151] "stat87"   "stat88"   "stat89"   "stat90"   "stat91"   "stat92"   "stat93"   "stat94"   "stat95"   "stat96"  
## [161] "stat97"   "stat98"   "stat99"   "stat100"  "stat101"  "stat102"  "stat103"  "stat104"  "stat105"  "stat106" 
## [171] "stat107"  "stat108"  "stat109"  "stat110"  "stat111"  "stat112"  "stat113"  "stat114"  "stat115"  "stat116" 
## [181] "stat117"  "stat118"  "stat119"  "stat120"  "stat121"  "stat122"  "stat123"  "stat124"  "stat125"  "stat126" 
## [191] "stat127"  "stat128"  "stat129"  "stat130"  "stat131"  "stat132"  "stat133"  "stat134"  "stat135"  "stat136" 
## [201] "stat137"  "stat138"  "stat139"  "stat140"  "stat141"  "stat142"  "stat143"  "stat144"  "stat145"  "stat146" 
## [211] "stat147"  "stat148"  "stat149"  "stat150"  "stat151"  "stat152"  "stat153"  "stat154"  "stat155"  "stat156" 
## [221] "stat157"  "stat158"  "stat159"  "stat160"  "stat161"  "stat162"  "stat163"  "stat164"  "stat165"  "stat166" 
## [231] "stat167"  "stat168"  "stat169"  "stat170"  "stat171"  "stat172"  "stat173"  "stat174"  "stat175"  "stat176" 
## [241] "stat177"  "stat178"  "stat179"  "stat180"  "stat181"  "stat182"  "stat183"  "stat184"  "stat185"  "stat186" 
## [251] "stat187"  "stat188"  "stat189"  "stat190"  "stat191"  "stat192"  "stat193"  "stat194"  "stat195"  "stat196" 
## [261] "stat197"  "stat198"  "stat199"  "stat200"  "stat201"  "stat202"  "stat203"  "stat204"  "stat205"  "stat206" 
## [271] "stat207"  "stat208"  "stat209"  "stat210"  "stat211"  "stat212"  "stat213"  "stat214"  "stat215"  "stat216" 
## [281] "stat217"

All predictors show a fat-tailed situation, where the two tails are very heavy and the density around the mean is low. The orderNorm transformation could help here (see the [Best Normalizator] section)

Interesting Predictors

Histograms

if (eda == TRUE){
  # Hand-picked representative predictors for a closer look.
  # NOTE(review): `cols` is created in the global environment here and is
  # reused by the scatter-plot chunk below (which also runs only when eda).
  cols = c('x11','x18','stat98','x7','stat110')
  df=gather(select_at(data,cols))
  # FIX: inside an if-block only the LAST expression's value is auto-printed,
  # so this ggplot object was silently discarded and the histograms never
  # rendered; print() it explicitly.
  print(
    ggplot(df, aes(value)) + 
      geom_histogram(aes(y=..density..),bins = 50,fill='light blue') + 
      geom_density() + 
      # stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))  
      facet_wrap(~key, scales = 'free',ncol=3)
  )
  
  # ggplot(gather(select_at(data,cols)), aes(sample=value)) + 
  #   stat_qq()+
  #   facet_wrap(~key, scales = 'free',ncol=2)
  
  # Summaries of the selected columns (last expression: auto-printed).
  lapply(select_at(data,cols),summary)
}
## $x11
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 9.000e-08 9.494e-08 1.001e-07 1.001e-07 1.052e-07 1.100e-07 
## 
## $x18
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.500   3.147   4.769   4.772   6.418   7.999 
## 
## $stat98
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.998619 -1.551882 -0.015993 -0.005946  1.528405  2.999499 
## 
## $x7
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.700   1.266   1.854   1.852   2.446   3.000 
## 
## $stat110
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.999543 -1.496865 -0.002193 -0.004129  1.504273  2.999563

Scatter plot vs. output variable **y3.cuberoot**

if (eda == TRUE){
  # Long format: one facet per selected predictor vs. the transformed output.
  # NOTE(review): relies on `cols` defined by the histogram chunk above.
  d = gather(dplyr::select_at(data,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
  # Last expression of the if-block, so the plot is auto-printed.
  ggplot(data=d, aes_string(x='value',y=output.var.tr)) + 
    geom_point(color='light green',alpha=0.5) + 
    geom_smooth() + 
    facet_wrap(~target, scales = 'free',ncol=3)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

All Predictors

Histograms

All indicators have a strong indication of Fat-Tails

if (eda == TRUE){
  # Density histograms for EVERY predictor (slow: ~280 facets).
  df=gather(select_at(data,predictors))
  # Last expression of the if-block, so the plot is auto-printed.
  ggplot(df, aes(value)) + 
    geom_histogram(aes(y=..density..),bins = 50,fill='light blue') + 
    geom_density() + 
    # stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))  
    facet_wrap(~key, scales = 'free',ncol=4)
}

Correlations

With Output Variable

if (eda == TRUE){
  #chart.Correlation(select(data,-JobName),  pch=21)
  # https://stackoverflow.com/questions/27034655/how-to-use-dplyrarrangedesc-when-using-a-string-as-column-name
  # Correlation of every predictor with the transformed output variable.
  t=as.data.frame(round(cor(dplyr::select(data,-one_of(output.var.tr,'JobName'))
                            ,select_at(data,output.var.tr)),4))  %>%
    rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-!!sym(output.var.tr))
  #DT::datatable(t)
  message("Top Positive")
  # FIX: inside an if-block only the last expression is auto-printed, so the
  # first kable() was silently discarded and the "Top Positive" table never
  # appeared in the rendered document; print() both tables explicitly.
  print(kable(head(arrange(t,desc(!!sym(output.var.tr))),20)))
  message("Top Negative")
  print(kable(head(arrange(t,!!sym(output.var.tr)),20)))
}
## Top Positive
## Top Negative
variable y3.cuberoot
x18sqinv -0.3542
x19byx18 -0.2383
x7sqinv -0.2055
stat110 -0.1568
x4 -0.0599
x16sqinv -0.0524
x9byx7 -0.0405
stat13 -0.0345
stat41 -0.0343
stat14 -0.0316
stat149 -0.0311
stat113 -0.0275
stat4 -0.0244
stat106 -0.0236
stat146 -0.0234
x8sqinv -0.0224
stat186 -0.0218
stat214 -0.0211
stat91 -0.0208
stat22 -0.0204

Between All Variables

if (eda == TRUE){
  #chart.Correlation(select(data,-JobName),  pch=21)
  # Full predictor-by-predictor correlation matrix.
  t=as.data.frame(round(cor(dplyr::select(data,-one_of('JobName'))),4))
  #DT::datatable(t,options=list(scrollX=T))
  message("Showing only 10 variables")
  # kable() is the last expression of the if-block, so it IS auto-printed here.
  kable(t[1:10,1:10])
}
## Showing only 10 variables
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
x1 1.0000 0.0034 -0.0028 0.0085 0.0068 0.0159 0.0264 -0.0012 0.0142 0.0013
x2 0.0034 1.0000 -0.0057 0.0004 -0.0094 -0.0101 0.0089 0.0078 0.0049 -0.0214
x3 -0.0028 -0.0057 1.0000 0.0029 0.0046 0.0006 -0.0105 -0.0002 0.0167 -0.0137
x4 0.0085 0.0004 0.0029 1.0000 -0.0059 0.0104 0.0098 0.0053 0.0061 -0.0023
x5 0.0068 -0.0094 0.0046 -0.0059 1.0000 0.0016 -0.0027 0.0081 0.0259 -0.0081
x6 0.0159 -0.0101 0.0006 0.0104 0.0016 1.0000 0.0200 -0.0157 0.0117 -0.0072
x7 0.0264 0.0089 -0.0105 0.0098 -0.0027 0.0200 1.0000 -0.0018 -0.0069 -0.0221
x8 -0.0012 0.0078 -0.0002 0.0053 0.0081 -0.0157 -0.0018 1.0000 0.0142 -0.0004
x9 0.0142 0.0049 0.0167 0.0061 0.0259 0.0117 -0.0069 0.0142 1.0000 0.0149
x10 0.0013 -0.0214 -0.0137 -0.0023 -0.0081 -0.0072 -0.0221 -0.0004 0.0149 1.0000

Scatter Plots with Output Variable

Scatter plots with all predictors and the output variable (y3.cuberoot)

if (eda == TRUE){
  # Scatter of every predictor against the transformed output (slow).
  d = gather(dplyr::select_at(data,c(predictors,output.var.tr)),key=target,value=value,-!!output.var.tr)
  # Last expression of the if-block, so the plot is auto-printed.
  ggplot(data=d, aes_string(x='value',y=output.var.tr)) + 
    geom_point(color='light blue',alpha=0.5) + 
    geom_smooth() + 
    facet_wrap(~target, scales = 'free',ncol=4)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Multicollinearity - VIF

No Multicollinearity among predictors

Showing Top predictor by VIF Value

if (eda == TRUE){
  # Variance Inflation Factors; VIF >> 10 suggests multicollinearity.
  # NOTE(review): the engineered variants (logs, ratios, inverse squares) are
  # by construction collinear with their source columns, which explains the
  # huge VIFs at the top of the table; the stat* variables are near-orthogonal.
  vifDF = usdm::vif(select_at(data,predictors)) %>% arrange(desc(VIF))
  head(vifDF,75)
}
##    Variables         VIF
## 1     x16log 4034.132490
## 2      x5log 2570.480236
## 3     x20log 1938.266861
## 4        x16 1858.952757
## 5        x11 1584.495455
## 6     x11log 1584.413764
## 7      x8log 1582.146975
## 8         x5 1189.237740
## 9        x20  902.575821
## 10        x8  744.289675
## 11     x7log  550.812309
## 12     x1log  531.107253
## 13  x16sqinv  462.792092
## 14    x13log  375.980264
## 15    x12log  370.400295
## 16    x18log  321.671898
## 17   x5sqinv  298.728516
## 18        x7  266.639612
## 19        x1  257.351308
## 20  x20sqinv  225.755989
## 21   x8sqinv  186.919980
## 22       x13  185.711703
## 23       x12  183.460153
## 24    x22log  175.757199
## 25       x18  159.785646
## 26       x22   89.143347
## 27   x7sqinv   66.833429
## 28   x1sqinv   64.365049
## 29       x21   58.186667
## 30        x2   48.203473
## 31  x13sqinv   46.729013
## 32    x21log   46.661585
## 33  x12sqinv   45.385536
## 34     x2log   42.112036
## 35  x18sqinv   41.396984
## 36       x23   38.305527
## 37    x23log   35.794212
## 38        x6   34.855249
## 39       x17   28.790620
## 40  x21byx20   23.359358
## 41  x22sqinv   23.122491
## 42     x6log   22.275514
## 43  x17byx16   21.860695
## 44    x6byx5   20.321670
## 45        x9   20.035371
## 46       x10   19.773782
## 47       x19   19.545113
## 48    x2byx1   17.394830
## 49       x14   17.277095
## 50   x10byx8   15.556597
## 51    x19log   14.852148
## 52     x9log   14.594729
## 53       x15   13.468616
## 54    x17log   12.810338
## 55    x14log   12.657383
## 56  x23byx22   12.160163
## 57    x9byx7   11.058388
## 58  x19byx18   10.649745
## 59  x14byx12    9.760691
## 60  x15byx13    9.255654
## 61    x15log    8.954102
## 62    x10log    8.856779
## 63   stat175    1.071200
## 64   stat141    1.070471
## 65    stat31    1.069392
## 66   stat171    1.069262
## 67    stat52    1.068932
## 68   stat186    1.068754
## 69     stat3    1.068417
## 70   stat134    1.068296
## 71   stat205    1.068047
## 72    stat20    1.067766
## 73        x3    1.067686
## 74   stat154    1.067546
## 75   stat152    1.067415

Feature Eng

  • Square Root transformation for x18
# Add a square-root transform of x18 (suggested by the scatter plots above).
data.tr=data %>%
  mutate(x18.sqrt = sqrt(x18)) 
# Columns to compare pre/post transformation.
cols=c('x18','x18.sqrt')

Comparing Pre and Post Transformation Density Plots

# ggplot(gather(select_at(data.tr,cols)), aes(value)) + 
#   geom_histogram(aes(y=..density..),bins = 50,fill='light blue') + 
#   geom_density() + 
#   facet_wrap(~key, scales = 'free',ncol=4)

# Scatter of x18 and x18.sqrt against the transformed output variable.
d = gather(dplyr::select_at(data.tr,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) + 
  geom_point(color='light blue',alpha=0.5) + 
  geom_smooth() + 
  facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#removing unwanted variables
# Drop the join key only. NOTE(review): both x18 and the untransformed output
# column remain in the modeling data (the commented line below shows an
# earlier, stricter drop that also removed them) — confirm this is intended.
data.tr=data.tr %>%
  #dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('x18','y3','JobName')])
  dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('JobName')])

data=data.tr
label.names=output.var.tr

Modeling

PCA

# Interaction mode for the PCA design matrix:
# 0 for no interaction, 
# 1 for Full 2 way interaction and 
# 2 for Selective 2 way interaction
# 3 for Selective 3 way interaction
InteractionMode = 2

# All columns except the label(s) enter the PCA.
pca.vars  = names(data)
pca.vars = pca.vars[!pca.vars %in% label.names]


# http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
# Use ~75% of the cores, leave the rest for other tasks.
# FIX: detectCores()*0.75 is usually fractional; floor()+max() guarantees
# makeCluster() receives a positive integer worker count.
cl <- makeCluster(max(1, floor(detectCores()*0.75)))
registerDoParallel(cl)
# NOTE(review): prcomp() does not use the foreach backend, so this cluster is
# not exercised here; it is kept for consistency with later chunks.

if(InteractionMode == 1){
  # Full two-way interactions among ALL pca variables.
  pca.formula =as.formula(paste0('~(',paste0(pca.vars, collapse ='+'),')^2'))
  pca.model = prcomp(formula=pca.formula,data=data[,pca.vars],center=T,scale.=T,retx = T)
  #saveRDS(pca.model,'pca.model.rds')
}
if (InteractionMode == 0){
  # No interactions: plain PCA on the raw columns.
  pca.model =  prcomp(x=data[,pca.vars],center=T,scale.=T,retx = T)
}
# FIX: scalar if-condition should use the short-circuit && rather than the
# vectorized & operator.
if (InteractionMode >= 2 && InteractionMode <= 3){
  # Selective modes: interactions only among the controlled "x" variables;
  # the stat* variables enter the design without interactions.
  controlled.vars = pca.vars[grep("^x",pca.vars)]
  stat.vars = pca.vars[grep("^stat",pca.vars)]
  
  if (InteractionMode >= 2){
    interaction.form = paste0('~(',paste0(controlled.vars, collapse ='+'),')^2')
  }
  if (InteractionMode >= 3){
    # Mode 3 overwrites the two-way form above with three-way interactions.
    interaction.form = paste0('~(',paste0(controlled.vars, collapse ='+'),')^3')
  }
  no.interact.form = paste0(stat.vars, collapse ='+')
  
  pca.formula = as.formula(paste(interaction.form, no.interact.form, sep = "+"))
  pca.model = prcomp(formula=pca.formula,data=data[,pca.vars],center=T,scale.=T,retx = T)
}

stopCluster(cl)
registerDoSEQ() # register sequential engine in case you are not using this function anymore
targetCumVar = .9  # keep enough components to explain 90% of total variance

# Variance bookkeeping on the fitted PCA model.
pca.model$var = pca.model$sdev ^ 2 #eigenvalues
pca.model$pvar = pca.model$var / sum(pca.model$var)  # proportion of variance per PC
pca.model$cumpvar = cumsum(pca.model$pvar )          # cumulative proportion
# Components kept: those whose cumulative proportion is still <= target.
pca.model$pcaSel = pca.model$cumpvar<=targetCumVar
pca.model$pcaSelCount = sum(pca.model$pcaSel)
pca.model$pcaSelTotVar = sum(pca.model$pvar[pca.model$pcaSel])
message(pca.model$pcaSelCount, " PCAs justify ",percent(targetCumVar)," of the total Variance. (",percent(pca.model$pcaSelTotVar),")")
# FIX: the first plot previously showed pca.model$var (eigenvalues) under a
# "Proportion of variance explained" label; plot the proportions so the y axis
# matches its label (and the cumulative plot below).
plot(pca.model$pvar,xlab="Principal component", ylab="Proportion of variance explained",   type='b')

plot(cumsum(pca.model$pvar ),xlab="Principal component", ylab="Cumulative Proportion of variance explained", ylim=c(0,1), type='b')

screeplot(pca.model,npcs = pca.model$pcaSelCount)

screeplot(pca.model,npcs = pca.model$pcaSelCount,type='lines')

#summary(pca.model)
#pca.model$rotation
#creating dataset
# Modeling dataset: label column(s) plus the selected principal-component scores.
data.pca = dplyr::select(data,!!label.names) %>% 
  dplyr::bind_cols(dplyr::select(as.data.frame(pca.model$x)
                                 ,!!colnames(pca.model$rotation)[pca.model$pcaSel])
  )

Train Test Split

# Shuffle, then make an 80/20 train/test split (caTools::sample.split).
data.pca = data.pca[sample(nrow(data.pca)),] # randomly shuffle data
split = sample.split(data.pca[,label.names], SplitRatio = 0.8)

data.train = subset(data.pca, split == TRUE)
data.test = subset(data.pca, split == FALSE)

Common Functions

# Regression diagnostics for a fitted lm-style model:
#  - base plot() diagnostic panels,
#  - studentized and standardized residual vs. fitted scatter plots,
#  - histogram of studentized residuals against N(0,1),
#  - leverage and Cook's distance index plots.
# Returns the vector of Cook's distances.
# NOTE(review): the dot in the name makes this look like an S3 `plot` method
# for class "diagnostics"; it is a plain function, but renaming would change
# the interface, so it is only flagged here.
plot.diagnostics <-  function(model, train) {
  plot(model)
  
  residuals = resid(model) # Plotted above in plot(lm.out)
  r.standard = rstandard(model)   # internally studentized (standardized) residuals
  r.student = rstudent(model)     # externally studentized residuals
  
  # Studentized residuals vs. fitted values.
  df = data.frame(x=predict(model,train),y=r.student)
  p=ggplot(data=df,aes(x=x,y=y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_hline(yintercept = 0,size=1)+
    ylab("Student Residuals") +
    xlab("Predicted Values")+
    ggtitle("Student Residual Plot")
  plot(p)
  
  # Standardized residuals vs. fitted values with +/-2 reference bands.
  # FIX: this plot uses r.standard, but the labels were copy-pasted from the
  # studentized plot above; they now say "Standardized".
  df = data.frame(x=predict(model,train),y=r.standard)
  p=ggplot(data=df,aes(x=x,y=y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_hline(yintercept = c(-2,0,2),size=1)+
    ylab("Standardized Residuals") +
    xlab("Predicted Values")+
    ggtitle("Standardized Residual Plot")
  plot(p)
  # Histogram of studentized residuals vs. the standard normal density.
  df=data.frame(r.student)
  p=ggplot(data=df,aes(r.student)) +
    geom_histogram(aes(y=..density..),bins = 50,fill='blue',alpha=0.6) + 
    stat_function(fun = dnorm, n = 100, args = list(mean = 0, sd = 1)) +
    ylab("Density")+
    xlab("Studentized Residuals")+
    ggtitle("Distribution of Studentized Residuals")
  plot(p)
  # http://www.stat.columbia.edu/~martin/W2024/R7.pdf
  # Influential plots
  inf.meas = influence.measures(model)
  # print (summary(inf.meas)) # too much data
  
  # Leverage plot (diagonal of the hat matrix by observation index).
  lev = hat(model.matrix(model))
  df=tibble::rownames_to_column(as.data.frame(lev),'id')
  p=ggplot(data=df,aes(x=as.numeric(id),y=lev)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    ylab('Leverage - check') + 
    xlab('Index')
  plot(p)
  # Cook's Distance
  # NOTE(review): points are labeled only above 15/n, while the reference line
  # and the printed count use the conventional 4/n cutoff — presumably to
  # label just the most extreme points; confirm this is intended.
  cd = cooks.distance(model)
  df=tibble::rownames_to_column(as.data.frame(cd),'id')
  p=ggplot(data=df,aes(x=as.numeric(id),y=cd)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_text(data=filter(df,cd>15/nrow(train)),aes(label=id),check_overlap=T,size=3,vjust=-.5)+
    ylab('Cooks distances') + 
    geom_hline(yintercept = c(4/nrow(train),0),size=1)+
    xlab('Index')
  plot(p)
  print (paste("Number of data points that have Cook's D > 4/n: ", length(cd[cd > 4/nrow(train)]), sep = "")) 
  print (paste("Number of data points that have Cook's D > 1: ", length(cd[cd > 1]), sep = "")) 
  return(cd)
}

# function to set up random seeds
# Based on http://jaehyeon-kim.github.io/2015/05/Setup-Random-Seeds-on-Caret-Package.html 
#
# Builds the `seeds` list expected by caret::trainControl():
#  - B resampling iterations, each an integer vector of (numbers + tunes) seeds,
#  - plus one final single-element vector for the last model fit.
# Returns NULL (caret's "use random seeds" sentinel) for any resampling
# method other than "cv" / "repeatedcv".
setCaretSeeds <- function(method = "cv", numbers = 1, repeats = 1, tunes = NULL, seed = 1701) {
  # B is the number of resamples; each list element holds an integer vector
  # of M seeds (numbers + tune length, if any).
  B <- if (method == "cv") numbers
  else if(method == "repeatedcv") numbers * repeats
  else NULL
  # FIX: the original tested is.null(length) — `length` is the base function
  # and is never NULL — so unsupported methods fell through into
  # vector(mode = "list", length = NULL) and raised an error. Test B instead.
  if(is.null(B)) {
    seeds <- NULL
  } else {
    set.seed(seed = seed)
    seeds <- vector(mode = "list", length = B)
    seeds <- lapply(seeds, function(x) sample.int(n = 1000000
                                                  , size = numbers + ifelse(is.null(tunes), 0, tunes)))
    seeds[[length(seeds) + 1]] <- sample.int(n = 1000000, size = 1)
  }
  # return seeds
  seeds
}



train.caret.glmselect = function(formula, data, method
                                 ,subopt = NULL, feature.names
                                 , train.control = NULL, tune.grid = NULL, pre.proc = NULL){
  
  if(is.null(train.control)){
    train.control <- trainControl(method = "cv"
                              ,number = 10
                              ,seeds = setCaretSeeds(method = "cv"
                                                     , numbers = 10
                                                     , seed = 1701)
                              ,search = "grid"
                              ,verboseIter = TRUE
                              ,allowParallel = TRUE
                              )
  }
  
  if(is.null(tune.grid)){
    if (method == 'leapForward' | method == 'leapBackward' | method == 'leapSeq'){
      tune.grid = data.frame(nvmax = 1:length(feature.names))
    }
    if (method == 'glmnet' && subopt == 'LASSO'){
      # Will only show 1 Lambda value during training, but that is OK
      # https://stackoverflow.com/questions/47526544/why-need-to-tune-lambda-with-carettrain-method-glmnet-and-cv-glmnet
      # Another option for LASSO is this: https://github.com/topepo/caret/blob/master/RegressionTests/Code/lasso.R
      lambda = 10^seq(-2,0, length =100)
      alpha = c(1)
      tune.grid = expand.grid(alpha = alpha,lambda = lambda)
    }
    if (method == 'lars'){
      # https://github.com/topepo/caret/blob/master/RegressionTests/Code/lars.R
      fraction = seq(0, 1, length = 100)
      tune.grid = expand.grid(fraction = fraction)
      pre.proc = c("center", "scale") 
    }
  }
  
  # http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
  # #cl <- makeCluster(ceiling(detectCores()*0.5)) # use 75% of cores only, leave rest for other tasks
  cl <- makeCluster(detectCores()*0.75) # use 75% of cores only, leave rest for other tasks
  registerDoParallel(cl)

  set.seed(1) 
  # note that the seed has to actually be set just before this function is called
  # settign is above just not ensure reproducibility for some reason
  model.caret <- caret::train(formula
                              , data = data
                              , method = method
                              , tuneGrid = tune.grid
                              , trControl = train.control
                              , preProc = pre.proc
                              )
  
  stopCluster(cl)
  registerDoSEQ() # register sequential engine in case you are not using this function anymore
  
  if (method == 'leapForward' | method == 'leapBackward' | method == 'leapSeq'){
    print("All models results")
    print(model.caret$results) # all model results
    print("Best Model")
    print(model.caret$bestTune) # best model
    model = model.caret$finalModel

    # Metrics Plot 
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-nvmax) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=nvmax,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    
    # Residuals Plot
    # leap function does not support studentized residuals
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
   
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') + 
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                                                       , sd = sd(dataPlot$res)),color='lightblue4')  
      theme_light()
    plot(residHistogram)
    id = rownames(model.caret$bestTune)    
    # Provides the coefficients of the best model
    # regsubsets doens return a full model (see documentation of regsubset), so we need to recalcualte themodel
    # https://stackoverflow.com/questions/13063762/how-to-obtain-a-lm-object-from-regsubsets
    print("Coefficients of final model:")
    coefs <- coef(model, id=id)
    #calculate the model to the the coef intervals
    nams <- names(coefs)
    nams <- nams[!nams %in% "(Intercept)"]
    response <-  as.character(formula[[2]])
    form <- as.formula(paste(response, paste(nams, collapse = " + "), sep = " ~ "))
    mod <- lm(form, data = data)
    #coefs
    #coef(mod)
    print(car::Confint(mod))
    return(list(model = model,id = id, residPlot = residPlot, residHistogram=residHistogram
                ,modelLM=mod))
  }
  if (method == 'glmnet' && subopt == 'LASSO'){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    
    print(model.caret$results)
    model=model.caret$finalModel
    # Metrics Plot 
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-lambda) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=lambda,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    
    # Residuals Plot 
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)

    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                                                       , sd = sd(dataPlot$res)),color='lightblue4')  
      theme_light()
    plot(residHistogram)
    
    print("Coefficients") 
    #no interval for glmnet: https://stackoverflow.com/questions/39750965/confidence-intervals-for-ridge-regression
    t=coef(model,s=model.caret$bestTune$lambda)
    model.coef = t[which(t[,1]!=0),]
    print(as.data.frame(model.coef))
    id = NULL # not really needed but added for consistency
    return(list(model = model.caret,id = id, residPlot = residPlot, metricsPlot=metricsPlot ))
  }
  if (method == 'lars'){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    
    # Metrics Plot
    dataPlot = model.caret$results %>%
        gather(key='metric',value='value',-fraction) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=fraction,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)

    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') + 
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                                                       , sd = sd(dataPlot$res)),color='lightblue4')  
      theme_light()
    plot(residHistogram)
    
    print("Coefficients") 
    t=coef(model.caret$finalModel,s=model.caret$bestTune$fraction,mode='fraction')
    model.coef = t[which(t!=0)]
    print(model.coef)
    id = NULL # not really needed but added for consistency
    return(list(model = model.caret,id = id, residPlot = residPlot, residHistogram=residHistogram))
  }
}

# https://stackoverflow.com/questions/48265743/linear-model-subset-selection-goodness-of-fit-with-k-fold-cross-validation
# changed slightly since call[[2]] was just returning "formula" without actually returning the value in formula
# Predict from a `regsubsets` fit (package leaps), which provides no native
# predict() method.
#   object  : fitted regsubsets object
#   newdata : data.frame of predictors to score
#   id      : index (model size) of the subset whose coefficients to use
#   formula : the full model formula used at fit time; passed explicitly
#             because object$call[[2]] may not evaluate in this environment
# Returns an n x 1 matrix of predictions.
predict.regsubsets <- function(object, newdata, id, formula, ...) {
    #form <- as.formula(object$call[[2]])
    mat <- model.matrix(formula, newdata) # adds intercept and expands any interaction terms
    coefi <- coef(object, id = id)
    xvars <- names(coefi)
    # drop = FALSE keeps the selection a matrix even when only one column
    # (e.g. an intercept-only subset) is chosen, so %*% stays conformable
    return(mat[, xvars, drop = FALSE] %*% coefi)
}
  
# Evaluate a fitted model on a held-out test set: prints a summary of the
# predictions and the test MSE/RMSE, optionally shows a transformed-scale
# comparison plot, and returns an actual-vs-predicted plot with +/- `good`
# and +/- `ok` relative-error bands.
#
#   model          : fitted model (lm, caret train object, or regsubsets)
#   test           : test data.frame
#   level          : confidence level for lm prediction intervals
#   draw.limits    : flag for drawing limit bands (bands are currently always drawn)
#   good, ok       : relative error bands for the final plot (default 10% / 15%)
#   method, subopt : caret method name and sub-option; NULL method = plain lm
#   id             : subset size for regsubsets-based methods
#   formula        : full model formula (required by predict.regsubsets)
#   feature.names  : predictor column names (needed to build glmnet's x matrix)
#   label.names    : response column name(s) in `test`
#   transformation : object whose predict(..., inverse = TRUE) undoes the
#                    response transform; used only when norm.pred == TRUE
#
# NOTE(review): reads the globals `log.pred` and `norm.pred` set in the
# parameter section instead of receiving them as arguments.
test.model = function(model, test, level=0.95
                      ,draw.limits = FALSE, good = 0.1, ok = 0.15
                      ,method = NULL, subopt = NULL
                      ,id = NULL, formula, feature.names, label.names
                      ,transformation = NULL){
  ## if using caret for glm select equivalent functionality,
  ## need to pass formula (full is ok as it will select subset of variables from there)
  if (is.null(method)){
    pred = predict(model, newdata=test, interval="confidence", level = level)
  } else if (method == 'leapForward' | method == 'leapBackward' | method == 'leapSeq'){
    # BUG FIX: these were independent `if (method == ...)` tests, which error
    # when method is NULL because NULL == "x" yields logical(0) — an invalid
    # if() condition. An else-if chain skips them safely.
    pred = predict.regsubsets(model, newdata = test, id = id, formula = formula)
  } else if (method == 'glmnet' && subopt == 'LASSO'){
    # glmnet predicts from a plain numeric matrix, not a data.frame
    xtest = as.matrix(test[,feature.names])
    pred=as.data.frame(predict(model, xtest))
  } else if (method == 'lars'){
    pred=as.data.frame(predict(model, newdata = test))
  }

  # Summary of predicted values
  print ("Summary of predicted values: ")
  print(summary(pred[,1]))

  test.mse = mean((test[,label.names]-pred[,1])^2)
  print (paste(method, subopt, "Test MSE:", test.mse, sep=" "))

  test.rmse = sqrt(test.mse)
  print (paste(method, subopt, "Test RMSE:", test.rmse, sep=" "))

  if(log.pred == TRUE || norm.pred == TRUE){
    # plot transformed-scale comparison first.
    # BUG FIX: inside a function a ggplot object must be print()ed explicitly;
    # without print() this plot was silently discarded.
    df=data.frame(x=test[,label.names],y=pred[,1])
    print(
      ggplot(df,aes(x=x,y=y)) +
        geom_point(color='blue',alpha=0.5,shape=20,size=2) +
        geom_abline(slope=1,intercept=0,color='black',size=1) +
        xlab("Actual (Transformed)")+
        ylab("Predicted (Transformed)")
    )
  }

  # Back-transform to the original scale for the final comparison plot
  if (log.pred == FALSE && norm.pred == FALSE){
    x = test[,label.names]
    y = pred[,1]
  }
  if (log.pred == TRUE){
    # NOTE(review): this inverts a base-10 log transform, but the label built
    # earlier in this document is '<output.var>.cuberoot' — confirm whether
    # the intended inverse is 10^x or x^3.
    x = 10^test[,label.names]
    y = 10^pred[,1]
  }
  if (norm.pred == TRUE){
    x = predict(transformation, test[,label.names], inverse = TRUE)
    y = predict(transformation, pred[,1], inverse = TRUE)
  }

  df=data.frame(x,y)
  # Final plot is the return value; +/- `good` bands in green, +/- `ok` in red
  ggplot(df,aes(x,y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_abline(slope=c(1+good,1-good,1+ok,1-ok)
                ,intercept=rep(0,4),color=c('dark green','dark green','dark red','dark red'),size=1,alpha=0.8) +
    xlab("Actual")+
    ylab("Predicted")
}

Setup Formulae

# Build the model formula programmatically from the training-set columns:
# response = the label column(s), predictors = every other column.
n <- names(data.train)
 formula <- as.formula(paste(paste(n[n %in% label.names], collapse = " + ")
                             ," ~", paste(n[!n %in% label.names], collapse = " + "))) 

# Intercept-only (grand mean) formula, used later as the null-model baseline
grand.mean.formula = as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~ 1"))

print(formula)
## y3.cuberoot ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + 
##     PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + 
##     PC18 + PC19 + PC20 + PC21 + PC22 + PC23 + PC24 + PC25 + PC26 + 
##     PC27 + PC28 + PC29 + PC30 + PC31 + PC32 + PC33 + PC34 + PC35 + 
##     PC36 + PC37 + PC38 + PC39 + PC40 + PC41 + PC42 + PC43 + PC44 + 
##     PC45 + PC46 + PC47 + PC48 + PC49 + PC50 + PC51 + PC52 + PC53 + 
##     PC54 + PC55 + PC56 + PC57 + PC58 + PC59 + PC60 + PC61 + PC62 + 
##     PC63 + PC64 + PC65 + PC66 + PC67 + PC68 + PC69 + PC70 + PC71 + 
##     PC72 + PC73 + PC74 + PC75 + PC76 + PC77 + PC78 + PC79 + PC80 + 
##     PC81 + PC82 + PC83 + PC84 + PC85 + PC86 + PC87 + PC88 + PC89 + 
##     PC90 + PC91 + PC92 + PC93 + PC94 + PC95 + PC96 + PC97 + PC98 + 
##     PC99 + PC100 + PC101 + PC102 + PC103 + PC104 + PC105 + PC106 + 
##     PC107 + PC108 + PC109 + PC110 + PC111 + PC112 + PC113 + PC114 + 
##     PC115 + PC116 + PC117 + PC118 + PC119 + PC120 + PC121 + PC122 + 
##     PC123 + PC124 + PC125 + PC126 + PC127 + PC128 + PC129 + PC130 + 
##     PC131 + PC132 + PC133 + PC134 + PC135 + PC136 + PC137 + PC138 + 
##     PC139 + PC140 + PC141 + PC142 + PC143 + PC144 + PC145 + PC146 + 
##     PC147 + PC148 + PC149 + PC150 + PC151 + PC152 + PC153 + PC154 + 
##     PC155 + PC156 + PC157 + PC158 + PC159 + PC160 + PC161 + PC162 + 
##     PC163 + PC164
print(grand.mean.formula)
## y3.cuberoot ~ 1
# Update feature.names because we may have transformed some features
feature.names = n[!n %in% label.names]

Full Model

# Fit the full OLS model with every principal component as a predictor
model.full = lm(formula , data.train)
summary(model.full)
## 
## Call:
## lm(formula = formula, data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24071 -0.08500 -0.02137  0.06450  0.75356 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  5.001e+00  1.634e-03 3060.210  < 2e-16 ***
## PC1         -1.769e-03  1.432e-04  -12.354  < 2e-16 ***
## PC2         -3.528e-03  1.448e-04  -24.363  < 2e-16 ***
## PC3         -1.589e-03  1.445e-04  -10.997  < 2e-16 ***
## PC4         -1.317e-03  1.483e-04   -8.882  < 2e-16 ***
## PC5          6.885e-04  1.511e-04    4.557 5.31e-06 ***
## PC6         -3.035e-04  1.526e-04   -1.988 0.046832 *  
## PC7         -8.211e-04  1.572e-04   -5.222 1.83e-07 ***
## PC8         -7.127e-05  1.582e-04   -0.451 0.652315    
## PC9         -1.850e-04  1.636e-04   -1.131 0.258291    
## PC10        -2.481e-05  1.639e-04   -0.151 0.879704    
## PC11        -1.966e-03  1.755e-04  -11.206  < 2e-16 ***
## PC12        -1.801e-03  1.873e-04   -9.617  < 2e-16 ***
## PC13         1.217e-03  1.909e-04    6.377 1.96e-10 ***
## PC14         9.326e-04  1.952e-04    4.778 1.82e-06 ***
## PC15         2.820e-05  1.999e-04    0.141 0.887821    
## PC16         1.211e-03  2.034e-04    5.955 2.77e-09 ***
## PC17        -9.325e-04  2.133e-04   -4.372 1.26e-05 ***
## PC18        -1.353e-03  2.221e-04   -6.090 1.21e-09 ***
## PC19         2.090e-04  2.238e-04    0.934 0.350297    
## PC20         1.595e-03  2.448e-04    6.516 7.85e-11 ***
## PC21         2.785e-04  2.550e-04    1.092 0.274863    
## PC22         4.189e-04  4.002e-04    1.047 0.295300    
## PC23         1.125e-03  4.940e-04    2.277 0.022824 *  
## PC24        -3.346e-03  5.772e-04   -5.798 7.11e-09 ***
## PC25         6.088e-04  6.454e-04    0.943 0.345549    
## PC26         1.905e-03  6.643e-04    2.867 0.004157 ** 
## PC27         2.634e-04  6.691e-04    0.394 0.693867    
## PC28        -2.005e-04  6.796e-04   -0.295 0.767989    
## PC29         1.865e-03  7.420e-04    2.513 0.011994 *  
## PC30         9.734e-05  7.654e-04    0.127 0.898809    
## PC31        -6.009e-04  8.143e-04   -0.738 0.460577    
## PC32        -2.744e-03  8.313e-04   -3.301 0.000971 ***
## PC33         1.795e-03  8.517e-04    2.108 0.035113 *  
## PC34         4.692e-03  8.904e-04    5.270 1.42e-07 ***
## PC35         1.338e-04  9.487e-04    0.141 0.887819    
## PC36         9.710e-05  9.729e-04    0.100 0.920507    
## PC37        -1.538e-03  9.922e-04   -1.550 0.121253    
## PC38         1.395e-03  1.034e-03    1.348 0.177658    
## PC39        -3.660e-04  1.073e-03   -0.341 0.733093    
## PC40        -1.263e-04  1.061e-03   -0.119 0.905301    
## PC41        -1.238e-03  1.095e-03   -1.131 0.258313    
## PC42        -9.786e-05  1.090e-03   -0.090 0.928448    
## PC43         1.012e-03  1.103e-03    0.918 0.358861    
## PC44         1.332e-03  1.119e-03    1.191 0.233850    
## PC45        -2.434e-04  1.128e-03   -0.216 0.829243    
## PC46         1.731e-04  1.130e-03    0.153 0.878229    
## PC47        -1.018e-03  1.127e-03   -0.904 0.366193    
## PC48        -5.885e-06  1.152e-03   -0.005 0.995923    
## PC49         1.013e-03  1.168e-03    0.868 0.385506    
## PC50        -1.203e-03  1.180e-03   -1.019 0.308119    
## PC51         1.216e-03  1.175e-03    1.035 0.300726    
## PC52        -2.183e-04  1.175e-03   -0.186 0.852598    
## PC53         1.227e-03  1.196e-03    1.026 0.304788    
## PC54         1.399e-04  1.195e-03    0.117 0.906761    
## PC55        -2.050e-04  1.185e-03   -0.173 0.862669    
## PC56         5.118e-04  1.211e-03    0.423 0.672644    
## PC57        -6.983e-04  1.222e-03   -0.571 0.567719    
## PC58         2.486e-04  1.207e-03    0.206 0.836856    
## PC59         3.778e-03  1.236e-03    3.057 0.002246 ** 
## PC60         7.741e-05  1.223e-03    0.063 0.949528    
## PC61         6.061e-04  1.230e-03    0.493 0.622201    
## PC62        -5.388e-04  1.242e-03   -0.434 0.664434    
## PC63        -2.670e-03  1.241e-03   -2.152 0.031433 *  
## PC64        -2.749e-03  1.258e-03   -2.185 0.028905 *  
## PC65        -1.891e-03  1.250e-03   -1.514 0.130205    
## PC66        -3.489e-04  1.266e-03   -0.276 0.782823    
## PC67         6.590e-04  1.272e-03    0.518 0.604313    
## PC68         3.001e-03  1.261e-03    2.380 0.017363 *  
## PC69         2.605e-03  1.285e-03    2.027 0.042664 *  
## PC70         1.050e-04  1.276e-03    0.082 0.934432    
## PC71         3.943e-03  1.280e-03    3.080 0.002081 ** 
## PC72        -4.817e-04  1.291e-03   -0.373 0.709013    
## PC73         2.075e-03  1.282e-03    1.619 0.105492    
## PC74        -2.625e-03  1.301e-03   -2.018 0.043661 *  
## PC75        -4.592e-03  1.313e-03   -3.497 0.000475 ***
## PC76        -1.069e-03  1.298e-03   -0.824 0.410259    
## PC77         2.136e-03  1.314e-03    1.625 0.104126    
## PC78         1.417e-03  1.317e-03    1.076 0.281913    
## PC79         9.617e-04  1.344e-03    0.716 0.474321    
## PC80        -1.566e-03  1.338e-03   -1.170 0.241857    
## PC81         4.110e-03  1.333e-03    3.083 0.002063 ** 
## PC82         8.432e-04  1.340e-03    0.629 0.529258    
## PC83        -3.242e-03  1.361e-03   -2.382 0.017245 *  
## PC84         2.766e-03  1.347e-03    2.053 0.040119 *  
## PC85         4.390e-03  1.349e-03    3.255 0.001141 ** 
## PC86        -1.353e-03  1.378e-03   -0.982 0.326302    
## PC87         6.301e-03  1.364e-03    4.620 3.92e-06 ***
## PC88        -3.645e-03  1.380e-03   -2.642 0.008262 ** 
## PC89        -3.915e-03  1.383e-03   -2.831 0.004654 ** 
## PC90        -3.854e-03  1.382e-03   -2.789 0.005299 ** 
## PC91         1.251e-04  1.388e-03    0.090 0.928226    
## PC92        -1.570e-03  1.395e-03   -1.125 0.260470    
## PC93        -5.045e-04  1.399e-03   -0.361 0.718342    
## PC94        -2.607e-03  1.403e-03   -1.858 0.063210 .  
## PC95        -6.749e-04  1.408e-03   -0.479 0.631687    
## PC96        -2.102e-03  1.415e-03   -1.485 0.137470    
## PC97        -2.935e-03  1.406e-03   -2.088 0.036840 *  
## PC98        -1.552e-03  1.411e-03   -1.100 0.271588    
## PC99        -1.787e-03  1.412e-03   -1.265 0.205894    
## PC100        3.199e-05  1.429e-03    0.022 0.982138    
## PC101        2.161e-04  1.415e-03    0.153 0.878667    
## PC102       -1.947e-03  1.424e-03   -1.367 0.171689    
## PC103        1.809e-03  1.417e-03    1.277 0.201713    
## PC104       -3.187e-03  1.421e-03   -2.242 0.025004 *  
## PC105        2.941e-03  1.433e-03    2.052 0.040232 *  
## PC106        3.598e-03  1.423e-03    2.528 0.011505 *  
## PC107        1.233e-03  1.441e-03    0.856 0.392189    
## PC108        1.006e-03  1.440e-03    0.698 0.485100    
## PC109        5.538e-04  1.446e-03    0.383 0.701820    
## PC110        8.325e-06  1.441e-03    0.006 0.995391    
## PC111       -9.561e-04  1.443e-03   -0.663 0.507657    
## PC112        7.693e-06  1.447e-03    0.005 0.995758    
## PC113        6.906e-04  1.449e-03    0.476 0.633762    
## PC114       -2.121e-03  1.446e-03   -1.466 0.142591    
## PC115       -6.693e-03  1.446e-03   -4.628 3.77e-06 ***
## PC116        9.847e-06  1.451e-03    0.007 0.994586    
## PC117       -1.612e-03  1.443e-03   -1.117 0.264007    
## PC118        2.800e-03  1.459e-03    1.920 0.054921 .  
## PC119       -2.613e-03  1.451e-03   -1.801 0.071811 .  
## PC120        1.611e-03  1.470e-03    1.096 0.273094    
## PC121       -6.055e-04  1.458e-03   -0.415 0.677928    
## PC122        2.197e-03  1.471e-03    1.494 0.135348    
## PC123       -3.670e-03  1.470e-03   -2.496 0.012577 *  
## PC124        7.619e-04  1.471e-03    0.518 0.604630    
## PC125        2.252e-03  1.484e-03    1.518 0.129109    
## PC126        1.042e-03  1.483e-03    0.703 0.482195    
## PC127        1.860e-03  1.481e-03    1.255 0.209464    
## PC128       -2.742e-03  1.485e-03   -1.847 0.064821 .  
## PC129        4.541e-04  1.485e-03    0.306 0.759691    
## PC130        2.831e-03  1.488e-03    1.902 0.057168 .  
## PC131       -3.091e-03  1.486e-03   -2.080 0.037550 *  
## PC132        1.860e-03  1.494e-03    1.245 0.213174    
## PC133       -6.502e-04  1.492e-03   -0.436 0.663105    
## PC134        4.060e-03  1.488e-03    2.729 0.006379 ** 
## PC135        1.503e-03  1.492e-03    1.008 0.313681    
## PC136        1.602e-03  1.499e-03    1.069 0.285328    
## PC137       -2.137e-03  1.499e-03   -1.425 0.154210    
## PC138        2.367e-03  1.507e-03    1.571 0.116196    
## PC139       -2.167e-03  1.495e-03   -1.450 0.147252    
## PC140       -1.140e-03  1.511e-03   -0.754 0.450638    
## PC141       -5.073e-04  1.504e-03   -0.337 0.735937    
## PC142        1.712e-04  1.499e-03    0.114 0.909116    
## PC143        1.363e-03  1.509e-03    0.904 0.366293    
## PC144        2.361e-03  1.509e-03    1.565 0.117599    
## PC145        8.391e-04  1.513e-03    0.554 0.579323    
## PC146        3.561e-03  1.510e-03    2.358 0.018408 *  
## PC147       -9.419e-04  1.518e-03   -0.620 0.535114    
## PC148       -1.712e-03  1.515e-03   -1.130 0.258360    
## PC149       -6.737e-04  1.523e-03   -0.442 0.658320    
## PC150       -1.624e-04  1.513e-03   -0.107 0.914494    
## PC151        2.331e-03  1.525e-03    1.528 0.126575    
## PC152       -1.454e-03  1.524e-03   -0.954 0.340296    
## PC153        2.923e-03  1.533e-03    1.907 0.056545 .  
## PC154       -2.706e-03  1.517e-03   -1.784 0.074423 .  
## PC155        2.600e-03  1.551e-03    1.676 0.093724 .  
## PC156        3.074e-03  1.541e-03    1.995 0.046079 *  
## PC157       -5.720e-04  1.535e-03   -0.373 0.709474    
## PC158       -3.334e-04  1.541e-03   -0.216 0.828745    
## PC159        6.423e-03  1.541e-03    4.167 3.13e-05 ***
## PC160       -4.750e-04  1.537e-03   -0.309 0.757201    
## PC161        1.130e-03  1.543e-03    0.732 0.464168    
## PC162       -4.541e-03  1.540e-03   -2.948 0.003211 ** 
## PC163        3.089e-03  1.538e-03    2.008 0.044663 *  
## PC164        3.990e-04  1.540e-03    0.259 0.795594    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1217 on 5419 degrees of freedom
## Multiple R-squared:  0.2553, Adjusted R-squared:  0.2327 
## F-statistic: 11.33 on 164 and 5419 DF,  p-value: < 2.2e-16
# Diagnostic plots + Cook's distances for the full model
# (plot.diagnostics is a helper defined earlier in this document)
cd.full = plot.diagnostics(model=model.full, train=data.train)

## [1] "Number of data points that have Cook's D > 4/n: 267"
## [1] "Number of data points that have Cook's D > 1: 0"

Checking with removal of high influence points

# Row names of the points whose Cook's distance exceeds the common 4/n cutoff
high.cd = names(cd.full[cd.full > 4/nrow(data.train)])

#save dataset with high.cd flagged
# NOTE(review): `t` shadows base::t() and is later passed as
# `transformation = t` to test.model() — confirm that is intentional.
t = data.train %>% 
  rownames_to_column() %>%
  mutate(high.cd = ifelse(rowname %in% high.cd,1,0))
#write.csv(t,file='data_high_cd_flag.csv',row.names = F)
###
# Refit the full model with the high-influence rows removed
data.train2 = data.train[!(rownames(data.train)) %in% high.cd,]
model.full2 = lm(formula , data.train2)
summary(model.full2)
## 
## Call:
## lm(formula = formula, data = data.train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.21613 -0.07416 -0.01461  0.06462  0.31527 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  4.987e+00  1.355e-03 3679.358  < 2e-16 ***
## PC1         -1.847e-03  1.217e-04  -15.181  < 2e-16 ***
## PC2         -3.441e-03  1.206e-04  -28.524  < 2e-16 ***
## PC3         -1.645e-03  1.209e-04  -13.606  < 2e-16 ***
## PC4         -1.384e-03  1.232e-04  -11.233  < 2e-16 ***
## PC5          6.475e-04  1.264e-04    5.122 3.13e-07 ***
## PC6         -2.078e-04  1.269e-04   -1.637 0.101718    
## PC7         -7.891e-04  1.307e-04   -6.035 1.70e-09 ***
## PC8         -6.656e-05  1.320e-04   -0.504 0.614257    
## PC9         -2.479e-05  1.361e-04   -0.182 0.855537    
## PC10        -8.504e-06  1.364e-04   -0.062 0.950279    
## PC11        -2.137e-03  1.455e-04  -14.691  < 2e-16 ***
## PC12        -1.835e-03  1.548e-04  -11.850  < 2e-16 ***
## PC13         1.207e-03  1.591e-04    7.586 3.90e-14 ***
## PC14         8.573e-04  1.618e-04    5.298 1.22e-07 ***
## PC15        -1.063e-04  1.663e-04   -0.639 0.522625    
## PC16         1.139e-03  1.688e-04    6.752 1.62e-11 ***
## PC17        -8.718e-04  1.765e-04   -4.941 8.03e-07 ***
## PC18        -1.345e-03  1.836e-04   -7.326 2.73e-13 ***
## PC19         2.692e-04  1.857e-04    1.450 0.147189    
## PC20         1.683e-03  2.031e-04    8.284  < 2e-16 ***
## PC21         3.339e-04  2.115e-04    1.578 0.114522    
## PC22         6.863e-04  3.315e-04    2.070 0.038481 *  
## PC23         1.298e-03  4.181e-04    3.104 0.001922 ** 
## PC24        -3.672e-03  4.823e-04   -7.614 3.14e-14 ***
## PC25         8.855e-04  5.395e-04    1.641 0.100762    
## PC26         1.121e-03  5.568e-04    2.013 0.044159 *  
## PC27        -2.506e-04  5.586e-04   -0.449 0.653701    
## PC28        -1.912e-04  5.671e-04   -0.337 0.736043    
## PC29         1.610e-03  6.171e-04    2.609 0.009111 ** 
## PC30         2.626e-04  6.437e-04    0.408 0.683345    
## PC31        -6.751e-04  6.803e-04   -0.992 0.321059    
## PC32        -3.039e-03  6.888e-04   -4.412 1.05e-05 ***
## PC33         9.568e-05  7.143e-04    0.134 0.893447    
## PC34         4.286e-03  7.391e-04    5.800 7.04e-09 ***
## PC35         3.975e-04  7.977e-04    0.498 0.618295    
## PC36        -5.279e-04  8.128e-04   -0.649 0.516064    
## PC37        -1.689e-03  8.234e-04   -2.051 0.040276 *  
## PC38         1.436e-03  8.605e-04    1.669 0.095242 .  
## PC39        -5.513e-04  9.312e-04   -0.592 0.553876    
## PC40        -1.269e-03  8.955e-04   -1.417 0.156467    
## PC41        -1.128e-03  9.131e-04   -1.236 0.216622    
## PC42         8.730e-04  9.191e-04    0.950 0.342222    
## PC43         1.740e-03  9.341e-04    1.863 0.062507 .  
## PC44         9.750e-04  9.596e-04    1.016 0.309636    
## PC45        -2.544e-05  9.545e-04   -0.027 0.978738    
## PC46         5.420e-04  9.513e-04    0.570 0.568856    
## PC47        -8.560e-04  9.485e-04   -0.902 0.366852    
## PC48        -1.791e-05  9.633e-04   -0.019 0.985167    
## PC49         1.410e-03  9.822e-04    1.435 0.151315    
## PC50        -1.606e-03  9.934e-04   -1.617 0.105954    
## PC51         1.239e-03  9.971e-04    1.242 0.214194    
## PC52         2.172e-04  9.880e-04    0.220 0.825973    
## PC53         1.937e-03  1.002e-03    1.933 0.053294 .  
## PC54        -3.993e-04  1.007e-03   -0.396 0.691788    
## PC55        -1.493e-03  9.999e-04   -1.493 0.135477    
## PC56         5.237e-04  1.029e-03    0.509 0.610722    
## PC57        -9.999e-04  1.020e-03   -0.981 0.326767    
## PC58        -9.614e-04  1.017e-03   -0.945 0.344544    
## PC59         3.580e-03  1.038e-03    3.450 0.000564 ***
## PC60        -1.017e-03  1.029e-03   -0.989 0.322749    
## PC61         3.521e-04  1.030e-03    0.342 0.732487    
## PC62         8.905e-04  1.046e-03    0.851 0.394771    
## PC63        -2.374e-03  1.044e-03   -2.274 0.023003 *  
## PC64        -3.043e-03  1.056e-03   -2.881 0.003977 ** 
## PC65        -1.642e-03  1.053e-03   -1.560 0.118793    
## PC66        -1.802e-04  1.066e-03   -0.169 0.865722    
## PC67         4.964e-04  1.069e-03    0.464 0.642428    
## PC68         2.918e-03  1.063e-03    2.745 0.006076 ** 
## PC69         3.159e-03  1.081e-03    2.923 0.003479 ** 
## PC70         9.358e-04  1.060e-03    0.883 0.377197    
## PC71         2.390e-03  1.070e-03    2.232 0.025640 *  
## PC72        -1.085e-03  1.080e-03   -1.005 0.314831    
## PC73         1.730e-03  1.070e-03    1.616 0.106170    
## PC74        -1.321e-03  1.090e-03   -1.212 0.225490    
## PC75        -2.974e-03  1.097e-03   -2.712 0.006701 ** 
## PC76        -1.783e-03  1.077e-03   -1.655 0.097948 .  
## PC77         1.944e-03  1.100e-03    1.767 0.077225 .  
## PC78         6.099e-04  1.099e-03    0.555 0.578895    
## PC79         1.348e-03  1.125e-03    1.199 0.230662    
## PC80        -1.049e-03  1.110e-03   -0.945 0.344595    
## PC81         4.484e-03  1.111e-03    4.034 5.55e-05 ***
## PC82         7.702e-04  1.117e-03    0.690 0.490533    
## PC83        -2.815e-03  1.137e-03   -2.475 0.013358 *  
## PC84         3.100e-03  1.127e-03    2.750 0.005976 ** 
## PC85         4.212e-03  1.130e-03    3.729 0.000194 ***
## PC86        -5.511e-04  1.148e-03   -0.480 0.631183    
## PC87         6.362e-03  1.134e-03    5.609 2.14e-08 ***
## PC88        -3.203e-03  1.151e-03   -2.783 0.005411 ** 
## PC89        -3.044e-03  1.153e-03   -2.641 0.008302 ** 
## PC90        -3.127e-03  1.155e-03   -2.708 0.006794 ** 
## PC91        -7.365e-04  1.154e-03   -0.638 0.523290    
## PC92        -1.094e-03  1.157e-03   -0.946 0.344442    
## PC93        -1.758e-03  1.164e-03   -1.511 0.130826    
## PC94        -1.826e-03  1.163e-03   -1.570 0.116375    
## PC95        -2.194e-04  1.174e-03   -0.187 0.851802    
## PC96        -1.808e-03  1.176e-03   -1.537 0.124246    
## PC97        -2.241e-03  1.170e-03   -1.916 0.055412 .  
## PC98        -2.276e-03  1.170e-03   -1.945 0.051842 .  
## PC99        -8.725e-04  1.175e-03   -0.743 0.457611    
## PC100        1.953e-04  1.184e-03    0.165 0.868964    
## PC101       -1.279e-03  1.178e-03   -1.086 0.277458    
## PC102       -1.582e-03  1.185e-03   -1.335 0.182000    
## PC103        8.412e-04  1.181e-03    0.712 0.476328    
## PC104       -3.076e-03  1.175e-03   -2.618 0.008879 ** 
## PC105        3.219e-03  1.194e-03    2.695 0.007052 ** 
## PC106        2.640e-03  1.182e-03    2.234 0.025548 *  
## PC107        8.720e-04  1.200e-03    0.727 0.467485    
## PC108       -8.127e-05  1.193e-03   -0.068 0.945695    
## PC109        7.600e-04  1.202e-03    0.632 0.527244    
## PC110        3.128e-04  1.198e-03    0.261 0.793944    
## PC111       -1.984e-03  1.201e-03   -1.653 0.098466 .  
## PC112       -1.514e-04  1.202e-03   -0.126 0.899810    
## PC113       -4.512e-05  1.203e-03   -0.037 0.970092    
## PC114       -2.379e-03  1.205e-03   -1.974 0.048461 *  
## PC115       -6.637e-03  1.204e-03   -5.515 3.67e-08 ***
## PC116        6.952e-04  1.206e-03    0.576 0.564387    
## PC117       -8.629e-04  1.202e-03   -0.718 0.472893    
## PC118        1.259e-03  1.212e-03    1.039 0.298770    
## PC119       -2.808e-03  1.207e-03   -2.326 0.020038 *  
## PC120        1.291e-03  1.221e-03    1.058 0.290200    
## PC121       -1.146e-03  1.208e-03   -0.949 0.342675    
## PC122        2.131e-03  1.222e-03    1.744 0.081263 .  
## PC123       -3.102e-03  1.219e-03   -2.546 0.010931 *  
## PC124       -3.855e-04  1.220e-03   -0.316 0.752025    
## PC125        2.886e-03  1.229e-03    2.348 0.018935 *  
## PC126       -2.597e-05  1.230e-03   -0.021 0.983160    
## PC127        6.621e-04  1.228e-03    0.539 0.589928    
## PC128       -2.720e-03  1.232e-03   -2.208 0.027323 *  
## PC129        4.891e-04  1.238e-03    0.395 0.692791    
## PC130        2.266e-03  1.235e-03    1.835 0.066600 .  
## PC131       -1.908e-03  1.231e-03   -1.550 0.121240    
## PC132        1.479e-03  1.238e-03    1.195 0.232130    
## PC133       -8.278e-04  1.248e-03   -0.664 0.506991    
## PC134        2.325e-03  1.237e-03    1.880 0.060142 .  
## PC135        1.546e-04  1.234e-03    0.125 0.900329    
## PC136        2.360e-03  1.247e-03    1.892 0.058512 .  
## PC137       -2.384e-03  1.241e-03   -1.921 0.054755 .  
## PC138        1.903e-03  1.252e-03    1.520 0.128544    
## PC139       -1.406e-03  1.243e-03   -1.131 0.258157    
## PC140       -2.393e-03  1.255e-03   -1.907 0.056562 .  
## PC141        6.913e-04  1.248e-03    0.554 0.579753    
## PC142        1.723e-03  1.241e-03    1.388 0.165260    
## PC143        1.536e-03  1.248e-03    1.231 0.218544    
## PC144        2.177e-03  1.256e-03    1.734 0.083045 .  
## PC145        1.942e-03  1.256e-03    1.546 0.122247    
## PC146        5.308e-03  1.249e-03    4.250 2.18e-05 ***
## PC147       -6.797e-04  1.265e-03   -0.538 0.590940    
## PC148       -1.453e-03  1.254e-03   -1.158 0.246750    
## PC149       -2.179e-04  1.260e-03   -0.173 0.862748    
## PC150        1.427e-03  1.258e-03    1.135 0.256456    
## PC151        2.797e-03  1.266e-03    2.210 0.027151 *  
## PC152       -1.777e-03  1.264e-03   -1.406 0.159853    
## PC153        1.449e-03  1.268e-03    1.143 0.252912    
## PC154       -2.017e-03  1.262e-03   -1.599 0.109942    
## PC155        1.731e-03  1.286e-03    1.346 0.178442    
## PC156        2.269e-03  1.283e-03    1.768 0.077124 .  
## PC157        1.070e-04  1.280e-03    0.084 0.933352    
## PC158        8.890e-05  1.282e-03    0.069 0.944731    
## PC159        5.287e-03  1.277e-03    4.141 3.51e-05 ***
## PC160       -9.185e-04  1.287e-03   -0.714 0.475440    
## PC161       -7.134e-04  1.280e-03   -0.558 0.577181    
## PC162       -4.775e-03  1.281e-03   -3.727 0.000196 ***
## PC163        2.188e-03  1.279e-03    1.711 0.087183 .  
## PC164        6.191e-04  1.276e-03    0.485 0.627607    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09829 on 5152 degrees of freedom
## Multiple R-squared:  0.3413, Adjusted R-squared:  0.3204 
## F-statistic: 16.28 on 164 and 5152 DF,  p-value: < 2.2e-16
# Diagnostics for the refit without high-influence points
cd.full2 = plot.diagnostics(model.full2, data.train2)

## [1] "Number of data points that have Cook's D > 4/n: 230"
## [1] "Number of data points that have Cook's D > 1: 0"
# much more normal residuals than before. 
# Checking to see if distributions are different and if so which variables
# High Leverage Plot 
# Label each training row as High (flagged by Cook's D) or Normal, keeping
# only the label column for a target-distribution comparison
plotData = data.train %>% 
  rownames_to_column() %>%
  mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
  dplyr::select(type,target=one_of(label.names))

ggplot(data=plotData, aes(x=type,y=target)) +
  geom_boxplot(fill='light blue',outlier.shape=NA) +
  scale_y_continuous(name="Target Variable Values",label=scales::comma_format(accuracy=.1)) +
  theme_light() +
  ggtitle('Distribution of High Leverage Points and Normal  Points')

# 2 sample t-tests

# Rebuild the flagged data, this time keeping the features, and run an
# equal-variance two-sample t-test per feature (High vs Normal)
plotData = data.train %>% 
  rownames_to_column() %>%
  mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
  dplyr::select(type,one_of(feature.names))

comp.test = lapply(dplyr::select(plotData, one_of(feature.names))
                   , function(x) t.test(x ~ plotData$type, var.equal = TRUE)) 

# Keep only the features whose distributions differ at the 5% level
# (list.filter is from the rlist package)
sig.comp = list.filter(comp.test, p.value < 0.05)
sapply(sig.comp, function(x) x[['p.value']])
##          PC1         PC11         PC23         PC24         PC25         PC26         PC33         PC39         PC44 
## 2.903635e-06 4.018025e-03 1.732969e-07 1.645539e-02 2.829691e-03 2.346443e-05 8.265634e-05 1.588183e-02 4.231731e-02 
##         PC45         PC54         PC55         PC75        PC153 
## 6.271234e-03 4.323172e-02 4.864219e-02 2.361957e-03 3.993003e-02
# Box plots of only the significantly different features
mm = melt(plotData, id=c('type')) %>% filter(variable %in% names(sig.comp))

ggplot(mm,aes(x=type, y=value)) +
  geom_boxplot()+
  facet_wrap(~variable, ncol=5, scales = 'free_y') +
  scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
  ggtitle('Distribution of High Leverage Points and Normal Points')

# Distribution (box) Plots
# Same comparison across ALL features
mm = melt(plotData, id=c('type'))

ggplot(mm,aes(x=type, y=value)) +
  geom_boxplot()+
  facet_wrap(~variable, ncol=8, scales = 'free_y') +
  scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
  ggtitle('Distribution of High Leverage Points and Normal Points')

Grand Means Model

# Intercept-only baseline model: predicts the grand mean of the response
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
## 
## Call:
## lm(formula = grand.mean.formula, data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40770 -0.09228 -0.01389  0.07716  0.78451 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.001722   0.001859    2691   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1389 on 5583 degrees of freedom

Variable Selection

Basic: http://www.stat.columbia.edu/~martin/W2024/R10.pdf Cross Validation + Other Metrics: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/

Forward Selection with CV

Train

# Forward selection with cross-validation via caret (leapForward);
# runs only when enabled in the document parameters.
if (algo.forward.caret == TRUE){
  set.seed(1)
  fit.fwd = train.caret.glmselect(formula = formula
                                  , data = data.train
                                  , method = "leapForward"
                                  , feature.names = feature.names)
  # Keep the fitted model and the selected subset size for the test step
  model.forward = fit.fwd$model
  id = fit.fwd$id
}

Test

# Evaluate the forward-selection model on the held-out test set
if (algo.forward.caret == TRUE){
    # NOTE(review): `transformation = t` passes the flagged data.frame `t`
    # built earlier (or base::t); only consumed when norm.pred == TRUE — verify.
    test.model(model=model.forward, test=data.test
             ,method = 'leapForward',subopt = NULL
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,id = id
             ,draw.limits = TRUE, transformation = t)
}

Backward Elimination with CV

Train

# Backward elimination with cross-validation via caret (leapBackward);
# runs only when enabled in the document parameters.
if (algo.backward.caret == TRUE){
  set.seed(1)
  fit.bwd = train.caret.glmselect(formula = formula
                                  , data = data.train
                                  , method = "leapBackward"
                                  , feature.names = feature.names)
  # Keep the fitted model and the selected subset size for the test step
  model.backward = fit.bwd$model
  id = fit.bwd$id
}

Test

if (algo.backward.caret == TRUE) {
  # Evaluate the backward-elimination model on the held-out test set.
  test.model(
    model          = model.backward,
    test           = data.test,
    method         = 'leapBackward',
    subopt         = NULL,
    formula        = formula,
    feature.names  = feature.names,
    label.names    = label.names,
    id             = id,
    draw.limits    = TRUE,
    transformation = t
  )
}

Stepwise Selection with CV

Train

if (algo.stepwise.caret == TRUE) {
  # Stepwise selection (leapSeq) trained with caret cross-validation.
  # Fixed seed keeps the CV fold assignment reproducible.
  set.seed(1)
  returned <- train.caret.glmselect(
    formula       = formula,
    data          = data.train,
    method        = "leapSeq",
    feature.names = feature.names
  )
  model.stepwise <- returned$model
  id             <- returned$id
}

Test

if (algo.stepwise.caret == TRUE) {
  # Evaluate the stepwise-selection model on the held-out test set.
  test.model(
    model          = model.stepwise,
    test           = data.test,
    method         = 'leapSeq',
    subopt         = NULL,
    formula        = formula,
    feature.names  = feature.names,
    label.names    = label.names,
    id             = id,
    draw.limits    = TRUE,
    transformation = t
  )
}

LASSO with CV

Train

if (algo.LASSO.caret == TRUE) {
  # LASSO via glmnet: alpha = 1 selects the pure L1 penalty; lambda is
  # tuned over a log-spaced grid from 1e-4 to 1e-2 (100 candidate values).
  set.seed(1)
  tune.grid <- expand.grid(
    alpha  = 1,
    lambda = 10^seq(from = -4, to = -2, length = 100)
  )
  returned <- train.caret.glmselect(
    formula       = formula,
    data          = data.train,
    method        = "glmnet",
    subopt        = 'LASSO',
    tune.grid     = tune.grid,
    feature.names = feature.names
  )
  model.LASSO.caret <- returned$model
}

Test

if (algo.LASSO.caret == TRUE) {
  # Evaluate the tuned LASSO model on the held-out test set.
  test.model(
    model          = model.LASSO.caret,
    test           = data.test,
    method         = 'glmnet',
    subopt         = "LASSO",
    formula        = formula,
    feature.names  = feature.names,
    label.names    = label.names,
    draw.limits    = TRUE,
    transformation = t
  )
}

LARS with CV

Train

if (algo.LARS.caret == TRUE){
  # Least Angle Regression (lars) trained with caret cross-validation.
  # Fixed seed keeps the CV fold assignment reproducible.
  set.seed(1)
  returned = train.caret.glmselect(formula = formula
                                   ,data =  data.train
                                   ,method = "lars"
                                   # FIX: was the string 'NULL', which is not
                                   # the NULL object — every other method and
                                   # the matching test.model call pass NULL.
                                   ,subopt = NULL
                                   ,feature.names = feature.names)
  model.LARS.caret = returned$model
}

Test

if (algo.LARS.caret == TRUE) {
  # Evaluate the LARS model on the held-out test set.
  test.model(
    model          = model.LARS.caret,
    test           = data.test,
    method         = 'lars',
    subopt         = NULL,
    formula        = formula,
    feature.names  = feature.names,
    label.names    = label.names,
    draw.limits    = TRUE,
    transformation = t
  )
}

Session Info

# Record the R version, platform, and loaded package versions so this
# analysis can be reproduced with the same environment.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                           LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2             knitr_1.20                 htmltools_0.3.6            reshape2_1.4.3            
##  [5] lars_1.2                   doParallel_1.0.14          iterators_1.0.10           caret_6.0-81              
##  [9] leaps_3.0                  ggforce_0.1.3              rlist_0.4.6.1              car_3.0-2                 
## [13] carData_3.0-2              bestNormalize_1.3.0        scales_1.0.0               onewaytests_2.0           
## [17] caTools_1.17.1.1           mosaic_1.5.0               mosaicData_0.17.0          ggformula_0.9.1           
## [21] ggstance_0.3.1             lattice_0.20-35            DT_0.5                     ggiraph_0.6.0             
## [25] investr_1.4.0              glmnet_2.0-16              foreach_1.4.4              Matrix_1.2-14             
## [29] MASS_7.3-50                PerformanceAnalytics_1.5.2 xts_0.11-2                 zoo_1.8-4                 
## [33] forcats_0.3.0              stringr_1.3.1              dplyr_0.7.8                purrr_0.2.5               
## [37] readr_1.3.1                tidyr_0.8.2                tibble_1.4.2               ggplot2_3.1.0             
## [41] tidyverse_1.2.1            usdm_1.1-18                raster_2.8-4               sp_1.3-1                  
## [45] pacman_0.5.0              
## 
## loaded via a namespace (and not attached):
##  [1] readxl_1.2.0       backports_1.1.3    plyr_1.8.4         lazyeval_0.2.1     splines_3.5.1      mycor_0.1.1       
##  [7] crosstalk_1.0.0    leaflet_2.0.2      digest_0.6.18      magrittr_1.5       mosaicCore_0.6.0   openxlsx_4.1.0    
## [13] recipes_0.1.4      modelr_0.1.2       gower_0.1.2        colorspace_1.3-2   rvest_0.3.2        ggrepel_0.8.0     
## [19] haven_2.0.0        crayon_1.3.4       jsonlite_1.5       bindr_0.1.1        survival_2.42-3    glue_1.3.0        
## [25] registry_0.5       gtable_0.2.0       ppcor_1.1          ipred_0.9-8        abind_1.4-5        rngtools_1.3.1    
## [31] bibtex_0.4.2       Rcpp_1.0.0         xtable_1.8-3       units_0.6-2        foreign_0.8-70     stats4_3.5.1      
## [37] lava_1.6.4         prodlim_2018.04.18 htmlwidgets_1.3    httr_1.4.0         RColorBrewer_1.1-2 pkgconfig_2.0.2   
## [43] farver_1.1.0       nnet_7.3-12        labeling_0.3       tidyselect_0.2.5   rlang_0.3.1        later_0.7.5       
## [49] munsell_0.5.0      cellranger_1.1.0   tools_3.5.1        cli_1.0.1          generics_0.0.2     moments_0.14      
## [55] sjlabelled_1.0.17  broom_0.5.1        evaluate_0.12      ggdendro_0.1-20    yaml_2.2.0         ModelMetrics_1.2.2
## [61] zip_2.0.1          nlme_3.1-137       doRNG_1.7.1        mime_0.6           xml2_1.2.0         compiler_3.5.1    
## [67] rstudioapi_0.8     curl_3.2           tweenr_1.0.1       stringi_1.2.4      highr_0.7          gdtools_0.1.7     
## [73] pillar_1.3.1       data.table_1.11.8  bitops_1.0-6       insight_0.1.2      httpuv_1.4.5       R6_2.3.0          
## [79] promises_1.0.1     gridExtra_2.3      rio_0.5.16         codetools_0.2-15   assertthat_0.2.0   pkgmaker_0.27     
## [85] withr_2.1.2        nortest_1.0-4      mgcv_1.8-24        hms_0.4.2          quadprog_1.5-5     grid_3.5.1        
## [91] rpart_4.1-13       timeDate_3043.102  class_7.3-14       rmarkdown_1.11     shiny_1.2.0        lubridate_1.7.4